Reference

https://bradleyboehmke.github.io/HOML/engineering.html#proper-implementation

Libraries and functions

library(here) # Comentar
library(tidyverse)
library(janitor) # Clean names
library(skimr) # Beautiful Summarize
library(magrittr) # Pipe operators
library(corrplot) # Correlations
library(ggcorrplot)  # Correlations
library(PerformanceAnalytics) # Correlations
library(leaps) # Model selection

Read Data

raw_data <-  read.csv("nba.csv")
colnames(raw_data)
##  [1] "Player"          "Salary"          "NBA_Country"     "NBA_DraftNumber"
##  [5] "Age"             "Tm"              "G"               "MP"             
##  [9] "PER"             "TS."             "X3PAr"           "FTr"            
## [13] "ORB."            "DRB."            "TRB."            "AST."           
## [17] "STL."            "BLK."            "TOV."            "USG."           
## [21] "OWS"             "DWS"             "WS"              "WS.48"          
## [25] "OBPM"            "DBPM"            "BPM"             "VORP"

Variables Names

raw_data %<>% clean_names()
colnames(raw_data)
##  [1] "player"           "salary"           "nba_country"      "nba_draft_number"
##  [5] "age"              "tm"               "g"                "mp"              
##  [9] "per"              "ts"               "x3p_ar"           "f_tr"            
## [13] "orb"              "drb"              "trb"              "ast"             
## [17] "stl"              "blk"              "tov"              "usg"             
## [21] "ows"              "dws"              "ws"               "ws_48"           
## [25] "obpm"             "dbpm"             "bpm"              "vorp"

Summarize Data

skim(raw_data)
Data summary
Name raw_data
Number of rows 485
Number of columns 28
_______________________
Column type frequency:
character 3
numeric 25
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
player 0 1 7 24 0 483 0
nba_country 0 1 3 16 0 44 0
tm 0 1 3 3 0 31 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
salary 0 1 6636507.50 7392601.91 46080.00 1471382.00 3202217.00 1.000e+07 34682550.00 ▇▂▁▁▁
nba_draft_number 0 1 29.45 21.13 1.00 11.00 25.00 4.700e+01 62.00 ▇▆▃▃▆
age 0 1 26.26 4.27 19.00 23.00 26.00 2.900e+01 41.00 ▇▇▆▂▁
g 0 1 50.17 24.87 1.00 29.00 59.00 7.100e+01 79.00 ▃▂▂▃▇
mp 0 1 1154.14 811.36 1.00 381.00 1134.00 1.819e+03 2898.00 ▇▅▆▅▂
per 0 1 13.26 8.77 -41.10 9.80 13.20 1.650e+01 134.10 ▁▇▁▁▁
ts 2 1 0.54 0.11 0.00 0.51 0.54 5.800e-01 1.50 ▁▇▂▁▁
x3p_ar 2 1 0.34 0.23 0.00 0.17 0.35 4.800e-01 1.00 ▇▇▇▂▁
f_tr 2 1 0.26 0.29 0.00 0.16 0.23 3.200e-01 5.33 ▇▁▁▁▁
orb 0 1 4.87 4.58 0.00 1.80 3.20 7.000e+00 35.90 ▇▂▁▁▁
drb 0 1 14.95 6.85 0.00 10.20 14.00 1.880e+01 37.60 ▂▇▅▂▁
trb 0 1 9.91 4.96 0.00 6.20 8.70 1.330e+01 26.50 ▂▇▃▂▁
ast 0 1 12.95 9.11 0.00 6.90 9.90 1.760e+01 49.40 ▇▅▂▁▁
stl 0 1 1.53 0.99 0.00 1.00 1.50 1.900e+00 12.50 ▇▁▁▁▁
blk 0 1 1.71 1.68 0.00 0.60 1.20 2.200e+00 13.40 ▇▂▁▁▁
tov 2 1 13.14 6.12 0.00 9.90 12.50 1.575e+01 66.70 ▇▆▁▁▁
usg 0 1 18.90 5.94 0.00 15.00 17.90 2.220e+01 45.10 ▁▇▆▁▁
ows 0 1 1.28 1.88 -2.30 0.00 0.80 2.000e+00 11.40 ▇▇▂▁▁
dws 0 1 1.18 1.03 0.00 0.30 1.00 1.800e+00 5.60 ▇▅▂▁▁
ws 0 1 2.46 2.67 -1.20 0.30 1.80 3.600e+00 15.00 ▇▅▁▁▁
ws_48 0 1 0.08 0.16 -1.06 0.04 0.08 1.200e-01 2.71 ▁▇▁▁▁
obpm 0 1 -1.27 5.03 -36.50 -2.70 -1.10 4.000e-01 68.70 ▁▇▁▁▁
dbpm 0 1 -0.49 2.39 -14.30 -1.70 -0.40 1.000e+00 6.80 ▁▁▃▇▁
bpm 0 1 -1.76 5.66 -49.20 -3.60 -1.30 5.000e-01 54.40 ▁▁▇▁▁
vorp 0 1 0.60 1.25 -1.30 -0.10 0.10 9.000e-01 8.60 ▇▃▁▁▁

Data Wrangling data

# delete duplicate
# Remove duplicate rows of the dataframe
raw_data %<>% distinct(player,.keep_all= TRUE)

# delete NA's
raw_data %<>% drop_na()

# Summarise
skim(raw_data)
Data summary
Name raw_data
Number of rows 481
Number of columns 28
_______________________
Column type frequency:
character 3
numeric 25
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
player 0 1 7 24 0 481 0
nba_country 0 1 3 16 0 44 0
tm 0 1 3 3 0 31 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
salary 0 1 6682859.45 7405536.17 46080.00 1471382.00 3290000.00 1.00e+07 34682550.00 ▇▂▁▁▁
nba_draft_number 0 1 29.29 21.10 1.00 10.00 24.00 4.70e+01 62.00 ▇▅▃▃▆
age 0 1 26.29 4.27 19.00 23.00 26.00 2.90e+01 41.00 ▇▇▆▂▁
g 0 1 50.52 24.67 1.00 30.00 59.00 7.10e+01 79.00 ▃▂▂▃▇
mp 0 1 1163.17 808.61 1.00 391.00 1155.00 1.83e+03 2898.00 ▇▅▆▅▂
per 0 1 13.36 8.74 -41.10 9.90 13.30 1.66e+01 134.10 ▁▇▁▁▁
ts 0 1 0.54 0.11 0.00 0.51 0.54 5.80e-01 1.50 ▁▇▂▁▁
x3p_ar 0 1 0.34 0.23 0.00 0.17 0.35 4.80e-01 1.00 ▇▇▇▂▁
f_tr 0 1 0.26 0.30 0.00 0.16 0.23 3.20e-01 5.33 ▇▁▁▁▁
orb 0 1 4.91 4.58 0.00 1.80 3.30 7.10e+00 35.90 ▇▂▁▁▁
drb 0 1 15.03 6.80 0.00 10.30 14.00 1.88e+01 37.60 ▂▇▅▂▁
trb 0 1 9.97 4.93 0.00 6.20 8.70 1.33e+01 26.50 ▂▇▃▂▁
ast 0 1 12.96 9.09 0.00 6.90 9.90 1.72e+01 49.40 ▇▅▂▁▁
stl 0 1 1.54 0.99 0.00 1.00 1.50 1.90e+00 12.50 ▇▁▁▁▁
blk 0 1 1.72 1.69 0.00 0.60 1.20 2.20e+00 13.40 ▇▂▁▁▁
tov 0 1 13.12 6.12 0.00 9.90 12.50 1.56e+01 66.70 ▇▆▁▁▁
usg 0 1 18.94 5.81 5.70 15.00 17.90 2.22e+01 45.10 ▂▇▃▁▁
ows 0 1 1.29 1.88 -2.30 0.00 0.80 2.00e+00 11.40 ▇▇▂▁▁
dws 0 1 1.19 1.03 0.00 0.30 1.00 1.80e+00 5.60 ▇▅▂▁▁
ws 0 1 2.48 2.67 -1.20 0.40 1.90 3.60e+00 15.00 ▇▅▁▁▁
ws_48 0 1 0.08 0.16 -1.06 0.04 0.08 1.20e-01 2.71 ▁▇▁▁▁
obpm 0 1 -1.22 5.02 -36.50 -2.60 -1.00 4.00e-01 68.70 ▁▇▁▁▁
dbpm 0 1 -0.48 2.39 -14.30 -1.60 -0.40 1.00e+00 6.80 ▁▁▂▇▁
bpm 0 1 -1.70 5.64 -49.20 -3.50 -1.20 6.00e-01 54.40 ▁▁▇▁▁
vorp 0 1 0.60 1.25 -1.30 -0.10 0.10 9.00e-01 8.60 ▇▃▁▁▁
raw_data %>% 
  select_at(vars(-c("player","nba_country","tm"))) %>% 
  tidyr::gather("id", "value", 2:25) %>% 
  ggplot(., aes(y=salary, x=value))+
  geom_point()+
  geom_smooth(method = "lm", se=FALSE, color="black")+
  facet_wrap(~id,ncol=2,scales="free_x")
## `geom_smooth()` using formula 'y ~ x'

raw_data %>% 
  select_at(vars(-c("player","nba_country","tm"))) %>% 
  tidyr::gather("id", "value", 2:25) %>% 
  ggplot(., aes(y=log(salary), x=value))+
  geom_point()+
  geom_smooth(method = "lm", se=FALSE, color="black")+
  facet_wrap(~id,ncol=2,scales="free_x")
## `geom_smooth()` using formula 'y ~ x'

EDA

Log salary

log_data <- raw_data %>% mutate(salary=log(salary))

skim(log_data)
Data summary
Name log_data
Number of rows 481
Number of columns 28
_______________________
Column type frequency:
character 3
numeric 25
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
player 0 1 7 24 0 481 0
nba_country 0 1 3 16 0 44 0
tm 0 1 3 3 0 31 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
salary 0 1 14.95 1.49 10.74 14.20 15.01 16.12 17.36 ▂▁▇▆▆
nba_draft_number 0 1 29.29 21.10 1.00 10.00 24.00 47.00 62.00 ▇▅▃▃▆
age 0 1 26.29 4.27 19.00 23.00 26.00 29.00 41.00 ▇▇▆▂▁
g 0 1 50.52 24.67 1.00 30.00 59.00 71.00 79.00 ▃▂▂▃▇
mp 0 1 1163.17 808.61 1.00 391.00 1155.00 1830.00 2898.00 ▇▅▆▅▂
per 0 1 13.36 8.74 -41.10 9.90 13.30 16.60 134.10 ▁▇▁▁▁
ts 0 1 0.54 0.11 0.00 0.51 0.54 0.58 1.50 ▁▇▂▁▁
x3p_ar 0 1 0.34 0.23 0.00 0.17 0.35 0.48 1.00 ▇▇▇▂▁
f_tr 0 1 0.26 0.30 0.00 0.16 0.23 0.32 5.33 ▇▁▁▁▁
orb 0 1 4.91 4.58 0.00 1.80 3.30 7.10 35.90 ▇▂▁▁▁
drb 0 1 15.03 6.80 0.00 10.30 14.00 18.80 37.60 ▂▇▅▂▁
trb 0 1 9.97 4.93 0.00 6.20 8.70 13.30 26.50 ▂▇▃▂▁
ast 0 1 12.96 9.09 0.00 6.90 9.90 17.20 49.40 ▇▅▂▁▁
stl 0 1 1.54 0.99 0.00 1.00 1.50 1.90 12.50 ▇▁▁▁▁
blk 0 1 1.72 1.69 0.00 0.60 1.20 2.20 13.40 ▇▂▁▁▁
tov 0 1 13.12 6.12 0.00 9.90 12.50 15.60 66.70 ▇▆▁▁▁
usg 0 1 18.94 5.81 5.70 15.00 17.90 22.20 45.10 ▂▇▃▁▁
ows 0 1 1.29 1.88 -2.30 0.00 0.80 2.00 11.40 ▇▇▂▁▁
dws 0 1 1.19 1.03 0.00 0.30 1.00 1.80 5.60 ▇▅▂▁▁
ws 0 1 2.48 2.67 -1.20 0.40 1.90 3.60 15.00 ▇▅▁▁▁
ws_48 0 1 0.08 0.16 -1.06 0.04 0.08 0.12 2.71 ▁▇▁▁▁
obpm 0 1 -1.22 5.02 -36.50 -2.60 -1.00 0.40 68.70 ▁▇▁▁▁
dbpm 0 1 -0.48 2.39 -14.30 -1.60 -0.40 1.00 6.80 ▁▁▂▇▁
bpm 0 1 -1.70 5.64 -49.20 -3.50 -1.20 0.60 54.40 ▁▁▇▁▁
vorp 0 1 0.60 1.25 -1.30 -0.10 0.10 0.90 8.60 ▇▃▁▁▁
# Excluded vars (factor)

vars <- c("player","nba_country","tm")

# Correlations
corrplot(cor(log_data %>% 
               select_at(vars(-vars)), 
             use = "complete.obs"), 
         method = "circle",type = "upper")
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(vars)` instead of `vars` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

# Other Correlations


ggcorrplot(cor(log_data %>% 
               select_at(vars(-vars)), 
            use = "complete.obs"),
            hc.order = TRUE,
            type = "lower",  lab = TRUE)

# Other Correlations

chart.Correlation(log_data %>% 
               select_at(vars(-vars)),
               histogram=TRUE, pch=19)

VIF

model_vif <- lm(salary~.-player-nba_country-tm, data=log_data)

vif_values <- car::vif(model_vif)

#create horizontal bar chart to display each VIF value
barplot(vif_values, main = "VIF Values", horiz = TRUE, col = "steelblue")

#add vertical line at 5
abline(v = 5, lwd = 3, lty = 2)

knitr::kable(vif_values)
x
nba_draft_number 1.340170
age 1.078945
g 6.999197
mp 14.172245
per 110.918970
ts 6.146914
x3p_ar 5.301979
f_tr 1.264400
orb 317.236811
drb 684.388198
trb 1439.666086
ast 3.445392
stl 3.208627
blk 5.305430
tov 1.917591
usg 6.861721
ows 1329.661494
dws 405.100887
ws 2683.730741
ws_48 67.930181
obpm 10524.307876
dbpm 2307.119717
bpm 12928.849699
vorp 11.551583

Conocimiento del negocio

Modelos no lineales e interacciones

Variables Categoricas

Variable endógena: - Salario: log

Variables exógenas:
- Edad (Age): se presupone que a mayor edad mayor salario - Edad elevado alcuadrado: considero que a partir de cierta edad ya no aumenta el salario con la edad - Número del draft(NBA_DraftNumber): a menor número en el draft mayor salario - Minutos jugados (MP): a mayor númerode minutos jugados mayor salario - Minutos jugados al cuadrado: a partir de un cierto número de minutosjugados ya no aumenta el salario - Eficiencia del jugador: a mayor eficiencia mayor salario - Eficiencia deljugador al cuadrado: a partir de cierto nivel de eficiencia ya no afecta al salario - Contribución a las victorias del equipo: a mayor contribución a las victorias del equipo mayor salario - Contribución a las victorias del equipo al cuadrado: a partir de cierto nivel de aportación a las victorias del equipo ya no afecta al salario - Porcentaje de participación en el juego (USG%): A mayor participación mayor salario - Valor sobre jugadorde reemplazo (VORP): a mayor VORP mayor salario - Valor sobre jugador de reemplazo al cuadrado: a partir de cierto nivel de VORP ya no afecta al salario - Efectividad de tiro (TS%): a mayor efectividad de tiro mayor salario - Efectividad asistencias (AST%): a mayor efectividad de asistencias mayor salario - Interacciónde WS y VORP (WS:VORP): considero que están relacionadas estas dos variables, a mayores valores deWS y VORP mayor será el salario del jugadorA continuación se filtra la base de datos para poder observar sólo las variables que me interesan.

Model Selection

nba <- log_data %>% select_at(vars(-vars))

set.seed(1234)
num_data <- nrow(nba)
num_data_test <- 10
train=sample(num_data ,num_data-num_data_test)


data_train <- nba[train,]
data_test  <-  nba[-train,]

model_select <- regsubsets(salary~. , data =data_train, method = "seqrep",nvmax=24)

model_select_summary <- summary(model_select)

data.frame(
  Adj.R2 = (model_select_summary$adjr2),
  CP = (model_select_summary$cp),
  BIC = (model_select_summary$bic)
)
model_select_summary$outmat
##           nba_draft_number age g   mp  per ts  x3p_ar f_tr orb drb trb ast stl
## 1  ( 1 )  " "              " " " " "*" " " " " " "    " "  " " " " " " " " " "
## 2  ( 1 )  "*"              "*" " " " " " " " " " "    " "  " " " " " " " " " "
## 3  ( 1 )  "*"              "*" "*" " " " " " " " "    " "  " " " " " " " " " "
## 4  ( 1 )  "*"              "*" " " "*" " " " " " "    " "  " " "*" " " " " " "
## 5  ( 1 )  "*"              "*" " " "*" " " " " " "    " "  " " "*" " " " " " "
## 6  ( 1 )  "*"              "*" " " "*" " " " " " "    " "  " " "*" " " " " " "
## 7  ( 1 )  "*"              "*" " " "*" "*" " " " "    " "  " " " " "*" " " " "
## 8  ( 1 )  "*"              "*" " " "*" "*" "*" " "    " "  " " "*" " " " " " "
## 9  ( 1 )  "*"              "*" " " "*" "*" "*" " "    " "  " " " " "*" " " " "
## 10  ( 1 ) "*"              "*" " " "*" "*" "*" "*"    " "  " " "*" " " " " " "
## 11  ( 1 ) "*"              "*" "*" "*" "*" "*" "*"    "*"  "*" "*" "*" " " " "
## 12  ( 1 ) "*"              "*" " " "*" "*" "*" "*"    " "  " " "*" " " " " " "
## 13  ( 1 ) "*"              "*" "*" "*" "*" "*" " "    " "  " " " " "*" "*" " "
## 14  ( 1 ) "*"              "*" "*" "*" "*" "*" " "    "*"  " " " " "*" "*" " "
## 15  ( 1 ) "*"              "*" "*" "*" "*" "*" "*"    "*"  "*" "*" "*" "*" "*"
## 16  ( 1 ) "*"              "*" "*" "*" "*" "*" "*"    "*"  " " " " "*" "*" " "
## 17  ( 1 ) "*"              "*" "*" "*" "*" "*" "*"    "*"  "*" " " "*" "*" " "
## 18  ( 1 ) "*"              "*" "*" "*" "*" "*" "*"    "*"  "*" " " "*" "*" " "
## 19  ( 1 ) "*"              "*" "*" "*" "*" "*" "*"    "*"  "*" "*" "*" "*" "*"
## 20  ( 1 ) "*"              "*" "*" "*" "*" "*" "*"    "*"  "*" " " "*" "*" "*"
## 21  ( 1 ) "*"              "*" "*" "*" "*" "*" "*"    "*"  "*" " " "*" "*" "*"
## 22  ( 1 ) "*"              "*" "*" "*" "*" "*" "*"    "*"  "*" "*" "*" "*" "*"
## 23  ( 1 ) "*"              "*" "*" "*" "*" "*" "*"    "*"  "*" "*" "*" "*" "*"
## 24  ( 1 ) "*"              "*" "*" "*" "*" "*" "*"    "*"  "*" "*" "*" "*" "*"
##           blk tov usg ows dws ws  ws_48 obpm dbpm bpm vorp
## 1  ( 1 )  " " " " " " " " " " " " " "   " "  " "  " " " " 
## 2  ( 1 )  " " " " " " " " " " " " " "   " "  " "  " " " " 
## 3  ( 1 )  " " " " " " " " " " " " " "   " "  " "  " " " " 
## 4  ( 1 )  " " " " " " " " " " " " " "   " "  " "  " " " " 
## 5  ( 1 )  " " " " " " " " " " " " " "   " "  "*"  " " " " 
## 6  ( 1 )  " " "*" " " " " " " " " " "   " "  "*"  " " " " 
## 7  ( 1 )  " " " " "*" " " " " " " " "   " "  " "  "*" " " 
## 8  ( 1 )  " " " " "*" " " " " " " " "   " "  " "  "*" " " 
## 9  ( 1 )  " " " " "*" " " " " " " "*"   " "  " "  "*" " " 
## 10  ( 1 ) " " "*" "*" " " " " " " " "   " "  " "  "*" " " 
## 11  ( 1 ) " " " " " " " " " " " " " "   " "  " "  " " " " 
## 12  ( 1 ) " " "*" "*" " " "*" " " "*"   " "  " "  "*" " " 
## 13  ( 1 ) " " "*" "*" " " "*" " " "*"   " "  "*"  " " " " 
## 14  ( 1 ) " " "*" "*" " " "*" " " "*"   " "  "*"  " " " " 
## 15  ( 1 ) "*" "*" " " " " " " " " " "   " "  " "  " " " " 
## 16  ( 1 ) " " "*" "*" " " "*" " " "*"   " "  "*"  " " "*" 
## 17  ( 1 ) " " "*" "*" " " "*" " " "*"   " "  "*"  " " "*" 
## 18  ( 1 ) " " "*" "*" " " "*" " " "*"   "*"  " "  "*" "*" 
## 19  ( 1 ) "*" "*" "*" "*" "*" "*" " "   " "  " "  " " " " 
## 20  ( 1 ) "*" "*" "*" "*" "*" "*" "*"   " "  " "  "*" " " 
## 21  ( 1 ) "*" "*" "*" " " "*" "*" "*"   "*"  "*"  "*" " " 
## 22  ( 1 ) "*" "*" "*" "*" "*" "*" "*"   "*"  "*"  " " " " 
## 23  ( 1 ) "*" "*" "*" "*" "*" "*" "*"   "*"  "*"  "*" " " 
## 24  ( 1 ) "*" "*" "*" "*" "*" "*" "*"   "*"  "*"  "*" "*"
plot(model_select, scale = "bic", main = "BIC")

data.frame(
  Adj.R2 = which.max(model_select_summary$adjr2),
  CP = which.min(model_select_summary$cp),
  BIC = which.min(model_select_summary$bic)
)
coef(model_select,which.min(model_select_summary$adjr2))
##  (Intercept)           mp 
## 13.729582488  0.001058354
coef(model_select,which.min(model_select_summary$cp))
##      (Intercept) nba_draft_number              age                g 
##     10.506881965     -0.022087641      0.099108353     -0.005630508 
##               mp              per               ts              trb 
##      0.001079432     -0.158889718      2.959813795      0.063559521 
##              ast              tov              usg              dws 
##      0.016280400     -0.021765149      0.072773966     -0.241306723 
##            ws_48             dbpm 
##      5.659374488      0.114845569
coef(model_select,which.min(model_select_summary$bic))
##      (Intercept) nba_draft_number              age               mp 
##    11.7177685858    -0.0226718307     0.0995664958     0.0007887849 
##              drb 
##     0.0245851796

“All models are wrong, some models are useful”, Box, G.E.P

# adjR2 model

nba_r2 <- lm(salary~ mp , data =data_train)
summary(nba_r2)
## 
## Call:
## lm(formula = salary ~ mp, data = data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1998 -0.7103  0.1219  0.7393  3.4727 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.373e+01  9.788e-02   140.3   <2e-16 ***
## mp          1.058e-03  6.873e-05    15.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.206 on 469 degrees of freedom
## Multiple R-squared:  0.3358, Adjusted R-squared:  0.3344 
## F-statistic: 237.1 on 1 and 469 DF,  p-value: < 2.2e-16
# CP model

nba_cp <- lm(salary~ nba_draft_number+age+mp+per+ts+f_tr+trb+ast+tov+usg+dws+ws_48+dbpm, data =data_train)
summary(nba_cp)
## 
## Call:
## lm(formula = salary ~ nba_draft_number + age + mp + per + ts + 
##     f_tr + trb + ast + tov + usg + dws + ws_48 + dbpm, data = data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5542 -0.5648  0.0041  0.6197  3.4279 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      10.3750847  0.5164161  20.091  < 2e-16 ***
## nba_draft_number -0.0220209  0.0024562  -8.966  < 2e-16 ***
## age               0.0984591  0.0110879   8.880  < 2e-16 ***
## mp                0.0009087  0.0001215   7.481 3.81e-13 ***
## per              -0.1560700  0.0380094  -4.106 4.77e-05 ***
## ts                2.8888794  0.8035310   3.595 0.000359 ***
## f_tr             -0.1972043  0.1667041  -1.183 0.237440    
## trb               0.0643382  0.0154437   4.166 3.71e-05 ***
## ast               0.0165634  0.0074575   2.221 0.026837 *  
## tov              -0.0189888  0.0090615  -2.096 0.036672 *  
## usg               0.0751283  0.0196126   3.831 0.000146 ***
## dws              -0.2223729  0.1080031  -2.059 0.040065 *  
## ws_48             5.5865320  1.6908769   3.304 0.001028 ** 
## dbpm              0.1059941  0.0337250   3.143 0.001782 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.007 on 457 degrees of freedom
## Multiple R-squared:  0.5486, Adjusted R-squared:  0.5357 
## F-statistic: 42.72 on 13 and 457 DF,  p-value: < 2.2e-16
# BIC model

nba_bic <- lm(salary~ nba_draft_number+age+mp+drb, data =data_train)
summary(nba_bic)
## 
## Call:
## lm(formula = salary ~ nba_draft_number + age + mp + drb, data = data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5796 -0.5286  0.0462  0.6092  3.0812 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.172e+01  3.354e-01  34.940  < 2e-16 ***
## nba_draft_number -2.267e-02  2.432e-03  -9.323  < 2e-16 ***
## age               9.957e-02  1.110e-02   8.968  < 2e-16 ***
## mp                7.888e-04  6.284e-05  12.551  < 2e-16 ***
## drb               2.459e-02  7.071e-03   3.477 0.000555 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.024 on 466 degrees of freedom
## Multiple R-squared:  0.5238, Adjusted R-squared:  0.5197 
## F-statistic: 128.2 on 4 and 466 DF,  p-value: < 2.2e-16
# Prediction

# adjR2
predict_r2 <- predict(nba_r2,newdata = data_test)
cbind(predict_r2,data_test$salary)
##     predict_r2         
## 8     13.85447 10.73813
## 27    15.43036 15.00640
## 32    14.67893 13.61170
## 44    14.80910 16.55332
## 144   14.98902 14.20171
## 183   14.07249 14.22401
## 434   13.83436 14.71210
## 436   13.83330 14.10156
## 457   13.82801 11.43128
## 480   15.67484 15.52106
exp(cbind(predict_r2,data_test$salary))
##     predict_r2         
## 8      1039726    46080
## 27     5027115  3290000
## 32     2371245   815615
## 44     2700920 15453126
## 144    3233331  1471382
## 183    1293017  1504560
## 434    1019028  2451225
## 436    1017950  1331160
## 457    1012577    92160
## 480    6419408  5504420
mean((data_test$salary-predict_r2)^2)
## [1] 2.132571
sqrt(mean((data_test$salary-predict_r2)^2))
## [1] 1.460332
# CP
predict_cp <- predict(nba_cp,newdata = data_test)
cbind(predict_cp,data_test$salary)
##     predict_cp         
## 8     12.75007 10.73813
## 27    16.19503 15.00640
## 32    14.00431 13.61170
## 44    15.56626 16.55332
## 144   14.56405 14.20171
## 183   13.76261 14.22401
## 434   13.75832 14.71210
## 436   14.07144 14.10156
## 457   12.97999 11.43128
## 480   15.72969 15.52106
exp(cbind(predict_cp,data_test$salary))
##     predict_cp         
## 8     344577.7    46080
## 27  10799662.8  3290000
## 32   1207803.7   815615
## 44   5758891.0 15453126
## 144  2113916.5  1471382
## 183   948474.6  1504560
## 434   944415.7  2451225
## 436  1291664.2  1331160
## 457   433648.9    92160
## 480  6781374.8  5504420
mean((data_test$salary-predict_cp)^2)
## [1] 1.028599
sqrt(mean((data_test$salary-predict_cp)^2))
## [1] 1.014199
# BIC
predict_bic <- predict(nba_bic,newdata = data_test)
cbind(predict_bic,data_test$salary)
##     predict_bic         
## 8      12.81815 10.73813
## 27     16.08326 15.00640
## 32     13.91118 13.61170
## 44     15.60099 16.55332
## 144    14.74093 14.20171
## 183    13.92808 14.22401
## 434    13.91641 14.71210
## 436    14.14277 14.10156
## 457    12.97544 11.43128
## 480    15.80329 15.52106
exp(cbind(predict_bic,data_test$salary))
##     predict_bic         
## 8      368850.6    46080
## 27    9657631.3  3290000
## 32    1100400.1   815615
## 44    5962422.9 15453126
## 144   2522917.6  1471382
## 183   1119148.8  1504560
## 434   1106169.2  2451225
## 436   1387156.7  1331160
## 457    431680.4    92160
## 480   7299291.2  5504420
mean((data_test$salary-predict_bic)^2)
## [1] 0.9959934
sqrt(mean((data_test$salary-predict_bic)^2))
## [1] 0.9979947